Expression analysis was performed on GEO dataset GSE15745 ‘Abundant Quantitative Trait Loci for CpG Methylation and Expression Across Human Brain Tissues’, which includes mRNA expression data for of 22,184 genes obtained from human brain tissues of 150 non-autistic individuals using Illumina humanRef-8 v2.0 expression beadchip. The study provides data from 4 different brain tissues: cerebellum (CRBLM), frontal cortex (FCTX), pons (PONS) and temporal cortex (TCTX). Samples originated from 3 distinct tissue banks: Baltimore Longitudinal Study of Aging (BLSA) from neuropathology department of John Hopkins hospital, non-BLSA samples from the same hospital (JHU) and NICHD brain and tissue bank for developmental disorders from University of Mariland medical school (UMARY). In addition, genome-wide SNP genotyping data of these subjects was obtained from dbGap, study accession phs000249.v1.p1, and included information for 561466 SNPs genotyped with Illumina HumanHap550v3.0 SNP array. Inversion genotypes were obtained with invClust
First, let us load the ExpressionSet with the expression data and the table with the inversion genotypes from invClust
load('DE/brain.Rdata')
load('DE/inversions.RData')
Then, we perform normalization between arrays
exprsBrain <- exprs(brainExpr)
exprsBrain.norm <- log2(limma::normalizeBetweenArrays(exprsBrain))
brainExpr.norm <- brainExpr
exprs(brainExpr.norm) <- exprsBrain.norm
sampleNames(brainExpr.norm) <- colnames(exprsBrain)
pp <- pData(brainExpr.norm)
pp$id <- unlist(lapply(strsplit(as.character(pp$title), "-"),
function(x) paste(x[1:2], collapse="-")))
pp2 <- left_join(pp, inversions, by=c("id"="SUBJID"))
rownames(pp2) <- pp2$geo_accession
pData(brainExpr.norm) <- pp2
And after that let us run differential expression analysis, one per region and inversion. Here we show the process for inv1_004
regions <- c("CRBLM", "FCTX", "PONS", "TCTX")
res <- list()
brainExpr.norm$inv <- additive(snp(pData(brainExpr.norm)[,'inv1_004']))
for (i in 1:4){
ii <- grep(regions[i], pData(brainExpr.norm)$`source_name_ch1`)
eSet <- brainExpr.norm[, ii]
ans <- runPipeline(eSet, "inv", sva=TRUE)
res[[i]] <- getProbeResults(ans, fNames = c("Symbol", "Chromosome", "Probe_Coordinates"))
}
names(res) <- regions
Finally, we extract the coordinates of each probe and keep those that overlap with the inversion
startstop <- function(x, pos){
x <- as.character(x)
if (x == ""){
return(0)
}
else{
coords <- c()
splits <- strsplit(x, ':')[[1]]
if (length(splits) == 1) {
coords[1] <- strsplit(splits[1], '-')[[1]][1]
coords[2] <- strsplit(splits[1], '-')[[1]][2]
}
else if (length(splits) > 1){
coords[1] <- strsplit(splits[1], '-')[[1]][1]
coords[2] <- strsplit(splits[length(splits)], '-')[[1]][2]
}
if (pos == 'start'){
return(min(as.numeric(coords[1]), as.numeric(coords[2])))
}
else if (pos == 'stop'){
return(max(as.numeric(coords[1]), as.numeric(coords[2])))
}
}
}
roi <- read.delim('DE/ROI.txt')
invinfo <- roi[roi$reg == 'inv1_004',]
invrange <- GRanges(seqnames = invinfo$chr, ranges = IRanges(start = invinfo$LBP-500000, end = invinfo$RBP+500000))
for (i in 1:4){
res.region <- res[[i]]
starts <- unlist(lapply(res.region$Probe_Coordinates, startstop, pos = 'start'))
stops <- unlist(lapply(res.region$Probe_Coordinates, startstop, pos = 'stop'))
ID <- res.region$Symbol
chr <- res.region$Chromosome
chr <- as.vector(chr)
chr[chr == ""] <- 0
chr <- as.factor(chr)
rangesdf <- data.frame(ID, chr, starts, stops)
granges <- makeGRangesFromDataFrame(rangesdf,
keep.extra.columns=FALSE,
ignore.strand=TRUE,
seqinfo=NULL,
seqnames.field="chr",
start.field="starts",
end.field="stops"
)
names(granges) <- rangesdf$ID
genesinv <- names(subsetByOverlaps(granges, invrange))
invres <- paste0(regions[i], '.', 'inv1_004')
assign(invres, subset(res.region, Symbol %in% genesinv))
}
CRBLM.inv1_004$Region <- c('CRBLM')
FCTX.inv1_004$Region <- c('FCTX')
TCTX.inv1_004$Region <- c('TCTX')
PONS.inv1_004$Region <- c('PONS')
allres <- rbind(CRBLM.inv1_004, FCTX.inv1_004, TCTX.inv1_004, PONS.inv1_004)
allres <- allres[order(allres$P.Value),]
Significant genes in inv1_004 are
topres <- allres[allres$P.Value < 0.05, c(6, 10, 12, 13)]
topres
## P.Value Symbol Probe_Coordinates Region
## ILMN_16577542 0.01440123 C1orf82 92626087-92626136 TCTX
## ILMN_16564502 0.01467792 BRDT 92190334-92190383 TCTX
## ILMN_17472852 0.01602019 HFM1 91615678-91615727 TCTX
## ILMN_17418011 0.01968234 CDC7 91763635-91763684 FCTX
## ILMN_16825773 0.03996710 FAM69A 93071130-93071164:93071165-93071179 PONS
## ILMN_16564503 0.04775801 BRDT 92190334-92190383 PONS
## ILMN_17301182 0.04901165 ZNF644 91153757-91153806 TCTX
For each of the significant genes we can make a boxplot (Figure 1 shows an example with C1orf82)
gene <- 'C1orf82'
id <- as.character(fData(brainExpr.norm)$ID[fData(brainExpr.norm)$Symbol == gene & fData(brainExpr.norm)$Probe_Coordinates == topres$Probe_Coordinates[topres$Symbol == gene]])
expr <- as.data.frame(exprs(brainExpr.norm)[id,], drop = F)
names(expr) <- 'Expression'
pdatasub <- pData(brainExpr.norm)[,c('source_name_ch1', 'inv1_004')]
names(pdatasub) <- c('Region', 'Inversion_Genotype')
plotdata <- merge(pdatasub, expr, by.x = 0, by.y = 0)
plotdata <- plotdata[!is.na(plotdata$Inversion_Genotype),]
plotdata$Region <- as.vector(plotdata$Region)
plotdata$Region[plotdata$Region == 'Human Brain Tissue: CRBLM'] <- 'CRBLM'
plotdata$Region[plotdata$Region == 'Human Brain Tissue: FCTX'] <- 'FCTX'
plotdata$Region[plotdata$Region == 'Human Brain Tissue: TCTX'] <- 'TCTX'
plotdata$Region[plotdata$Region == 'Human Brain Tissue: PONS'] <- 'PONS'
plotdata$Region <- as.factor(plotdata$Region)
pval <- data.frame(allres[c(allres$Symbol == gene &
allres$Probe_Coordinates %in% topres$Probe_Coordinates), c(6, 10, 13)])
pval$x <- Inf
pval$y <- Inf
labs <- unlist(lapply(pval$`P.Value`, function(x) paste0('p-value = ', formatC(x, digits = 2, format = 'e'))))
pval$lab <- labs
ggplot(plotdata, aes(x=Inversion_Genotype, y=Expression, group = Inversion_Genotype)) +
theme(legend.position = 'none',
plot.title = element_text(size = 20, family = 'Helvetica'),
axis.text.x = element_text(size = 12),
axis.text.y = element_text(size = 10),
axis.title.x = element_text(size = 12, margin = margin(t = 10, r = 0, b = 0, l = 0)),
axis.title.y = element_text(size = 12, margin = margin(t = 0, r = 10, b = 0, l = 0))) +
ggtitle(gene) +
ylab('Normalized expression') +
xlab('Inversion genotype') +
geom_boxplot(aes(fill = Inversion_Genotype)) +
scale_fill_manual(values = c('darkseagreen1', 'darkseagreen3', 'darkseagreen4')) +
facet_grid(. ~ Region) +
theme(strip.text.x = element_text(size = 15, face = 'bold', family = 'Helvetica'),
plot.margin = margin(t = 15, r = 10, b = 5, l = 10)) +
geom_label(data = data.frame(x = -Inf, y = Inf, lab = pval$lab, Region = pval$Region), aes(x, y, label = lab, group=NULL), hjust = 0, vjust = 1)
Figure 1: C1or82 brain expression between inversion inv1_004 genotypes
Plots for all significative genes in all genotyped inversions (Figures 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 and 16)
Figure 2: Top genes for inv1_004
Figure 3: Top genes for inv2_013
Figure 4: Top genes for inv3_003
Figure 5: Top genes for inv6_002
Figure 6: Top genes for inv6_006
Figure 7: Top genes for inv7_005
Figure 8: Top genes for inv7_011
Figure 9: Top genes for inv8_001
Figure 10: Top genes for inv10_005
Figure 11: Top genes for inv11_004
Figure 12: Top genes for inv12_004
Figure 13: Top genes for inv12_006
Figure 14: Top genes for inv14_005
Figure 15: Top genes for inv16_009
Figure 16: Top genes for inv17_007
sessionInfo()
## R version 4.0.0 (2020-04-24)
## Platform: x86_64-conda_cos6-linux-gnu (64-bit)
## Running under: CentOS Linux 7 (Core)
##
## Matrix products: default
## BLAS/LAPACK: /home/isglobal.lan/lbalague/miniconda2/envs/brain/lib/libopenblasp-r0.3.9.so
##
## locale:
## [1] LC_CTYPE=es_ES.UTF-8 LC_NUMERIC=C LC_TIME=es_ES.UTF-8 LC_COLLATE=es_ES.UTF-8 LC_MONETARY=es_ES.UTF-8 LC_MESSAGES=es_ES.UTF-8 LC_PAPER=es_ES.UTF-8 LC_NAME=C LC_ADDRESS=C LC_TELEPHONE=C LC_MEASUREMENT=es_ES.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats4 parallel stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] png_0.1-7 GenomicRanges_1.40.0 GenomeInfoDb_1.24.0 IRanges_2.22.2 S4Vectors_0.26.1 gridExtra_2.3 limma_3.44.1 SNPassoc_1.9-2 mvtnorm_1.1-1 survival_3.1-12 haplo.stats_1.7.9 MEAL_1.18.0 MultiDataSet_1.16.0 Biobase_2.48.0 BiocGenerics_0.34.0 invClust_1.0 forcats_0.5.0 stringr_1.4.0 dplyr_1.0.0 purrr_0.3.4 readr_1.3.1 tidyr_1.1.0 tibble_3.0.1 ggplot2_3.3.1 tidyverse_1.3.0 BiocStyle_2.16.0
##
## loaded via a namespace (and not attached):
## [1] rms_6.0-0 tidyselect_1.1.0 RSQLite_2.2.0 AnnotationDbi_1.50.0 htmlwidgets_1.5.1 grid_4.0.0 BiocParallel_1.22.0 munsell_0.5.0 codetools_0.2-16 preprocessCore_1.50.0 withr_2.2.0 fastICA_1.2-2 colorspace_1.4-1 highr_0.8 knitr_1.28 rstudioapi_0.11 labeling_0.3 JADE_2.0-3 isva_1.9 GenomeInfoDbData_1.2.3 farver_2.0.3 bit64_0.9-7 rhdf5_2.32.0 TH.data_1.0-10 vctrs_0.3.1 generics_0.0.2 xfun_0.14 qqman_0.1.4 BiocFileCache_1.12.0 R6_2.4.1 clue_0.3-57 illuminaio_0.30.0 locfit_1.5-9.4 bitops_1.0-6 reshape_0.8.8
## [36] DelayedArray_0.14.0 assertthat_0.2.1 scales_1.1.1 multcomp_1.4-13 nnet_7.3-14 gtable_0.3.0 sva_3.36.0 sandwich_2.5-1 MatrixModels_0.4-1 rlang_0.4.6 genefilter_1.70.0 calibrate_1.7.5 splines_4.0.0 rtracklayer_1.48.0 acepack_1.4.1 GEOquery_2.56.0 broom_0.5.6 checkmate_2.0.0 reshape2_1.4.4 BiocManager_1.30.10 yaml_2.2.1 modelr_0.1.8 snpStats_1.38.0 GenomicFeatures_1.40.0 backports_1.1.7 qvalue_2.20.0 Hmisc_4.4-0 tools_4.0.0 bookdown_0.19 nor1mix_1.3-0 ellipsis_0.3.1 RColorBrewer_1.1-2 siggenes_1.62.0 Rcpp_1.0.4.6 plyr_1.8.6
## [71] base64enc_0.1-3 progress_1.2.2 zlibbioc_1.34.0 RCurl_1.98-1.2 prettyunits_1.1.1 rpart_4.1-15 openssl_1.4.1 bumphunter_1.30.0 zoo_1.8-8 SummarizedExperiment_1.18.1 haven_2.3.1 cluster_2.1.0 fs_1.4.1 magrittr_1.5 magick_2.3 RSpectra_0.16-0 data.table_1.12.8 SparseM_1.78 reprex_0.3.0 matrixStats_0.56.0 hms_0.5.3 evaluate_0.14 xtable_1.8-4 XML_3.99-0.3 jpeg_0.1-8.1 mclust_5.4.6 readxl_1.3.1 compiler_4.0.0 biomaRt_2.44.0 minfi_1.34.0 crayon_1.3.4 htmltools_0.4.0 mgcv_1.8-31 Formula_1.2-3 lubridate_1.7.8
## [106] DBI_1.1.0 dbplyr_1.4.4 MASS_7.3-51.6 rappdirs_0.3.1 Matrix_1.2-18 permute_0.9-5 cli_2.0.2 quadprog_1.5-8 pkgconfig_2.0.3 GenomicAlignments_1.24.0 foreign_0.8-80 SmartSVA_0.1.3 xml2_1.3.2 foreach_1.5.0 annotate_1.66.0 rngtools_1.5 multtest_2.44.0 beanplot_1.2 XVector_0.28.0 rvest_0.3.5 doRNG_1.8.2 scrime_1.3.5 digest_0.6.25 vegan_2.5-6 Biostrings_2.56.0 rmarkdown_2.3 base64_2.0 cellranger_1.1.0 htmlTable_1.13.3 edgeR_3.30.3 DelayedMatrixStats_1.10.0 curl_4.3 quantreg_5.55 Rsamtools_2.4.0 lifecycle_0.2.0
## [141] nlme_3.1-148 jsonlite_1.6.1 Rhdf5lib_1.10.0 askpass_1.1 fansi_0.4.1 pillar_1.4.4 lattice_0.20-41 httr_1.4.1 glue_1.4.1 iterators_1.0.12 bit_1.1-15.2 class_7.3-17 stringi_1.4.6 HDF5Array_1.16.0 blob_1.2.1 polspline_1.1.19 latticeExtra_0.6-29 memoise_1.1.0 e1071_1.7-3